This document aims at exploring two datasets, one in 2016 on 6 individuals and another one in 2018 on 4 individuals. For that purpose, we need first to load the weanlingNES package to load data.
# Load the package that ships the weanling elephant-seal datasets
library(weanlingNES)
# Attach `data_nes`: a list with one element per field season
# (year_2016, year_2018), each itself a list of per-individual data.tables
data("data_nes", package = "weanlingNES")
Let’s have a look at what’s inside data_nes$year_2016:
# Compact one-level overview of the 2016 list: one data.table per seal.
# TRUE/FALSE are spelled out instead of the reassignable T/F shortcuts.
str(data_nes$year_2016, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_3449:Classes 'data.table' and 'data.frame': 384 obs. of 23 variables:
## $ ind_3450:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3456:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3457:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3460:Classes 'data.table' and 'data.frame': 253 obs. of 23 variables:
## $ ind_3463:Classes 'data.table' and 'data.frame': 213 obs. of 23 variables:
A list of 6
data.frames, one for each seal
For convenience, we aggregate all 6 individuals into one dataset.
# Combine all individuals into one data.table. `use.names` is spelled in
# full -- the original `use.name` only worked through R's partial argument
# matching. `idcol = TRUE` keeps each source list name as an `.id` column.
data_2016 <- rbindlist(data_nes$year_2016, use.names = TRUE, idcol = TRUE)
# Display an interactive table of 100 randomly sampled rows
DT::datatable(data_2016[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# Per-seal summary of the raw 2016 data: number of days recorded, deepest
# dive, and mean/sd of the sea-surface temperature
diving_summary_2016 <- data_2016[
  ,
  .(
    nb_days_recorded = uniqueN(as.Date(date)),
    max_depth = max(maxpress_dbars),
    sst_mean = mean(sst2_c),
    sst_sd = sd(sst2_c)
  ),
  by = .id
]
sable(
  diving_summary_2016,
  caption = "Summary diving information relative to each 2016 individual",
  digits = 2
)
| .id | nb_days_recorded | max_depth | sst_mean | sst_sd |
|---|---|---|---|---|
| ind_3449 | 384 | 1118.81 | 26.54 | 129.91 |
| ind_3450 | 253 | 954.81 | 130.29 | 367.69 |
| ind_3456 | 253 | 697.63 | 125.24 | 360.39 |
| ind_3457 | 253 | 572.94 | 135.56 | 374.71 |
| ind_3460 | 253 | 832.25 | 65.24 | 249.12 |
| ind_3463 | 213 | 648.81 | 212.19 | 462.88 |
Well, it seems that sst is a bit odd. Let’s have a look at its distribution.
# Histogram of the raw sst2 values, one panel per seal
ggplot(data = data_2016, mapping = aes(fill = .id, x = sst2_c)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
Figure 1.1: Distribution of raw sst2 for the six individuals in 2016
Let’s remove any data with a sst2_c > 500.
data_2016_filter = data_2016[sst2_c < 500, ]
# Histogram of the filtered sst2 values, one panel per seal
ggplot(data = data_2016_filter, mapping = aes(fill = .id, x = sst2_c)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(.id ~ .) +
  theme_jjo()
Figure 1.2: Distribution of filtered sst2 for the six individuals in 2016
Well,that seems to be much better! In the process of filtering out odd values, we removed 116 rows this way:
# Count the rows removed by the sst2 filter, per individual.
# The caption typo "indivduals" is fixed to "individuals".
data_2016[sst2_c > 500, .(nb_row_removed = .N), by = .id] %>%
  sable(caption = "# of rows removed by individuals")
| .id | nb_row_removed |
|---|---|
| ind_3449 | 4 |
| ind_3450 | 23 |
| ind_3456 | 22 |
| ind_3457 | 24 |
| ind_3460 | 10 |
| ind_3463 | 33 |
# Dive-depth time series per seal, with the sst2 outlier rows overlaid
# as black points to show where/when they occurred
sst_outliers_2016 <- data_2016[sst2_c > 500]
ggplot(data_2016, aes(x = as.Date(date), y = -maxpress_dbars, col = .id)) +
  geom_path(show.legend = FALSE) +
  geom_point(data = sst_outliers_2016, col = "black") +
  scale_x_date(date_labels = "%m/%Y") +
  labs(x = "Date", y = "Pressure (dbar)") +
  facet_wrap(.id ~ .) +
  theme_jjo() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Figure 1.3: Where and when the sst2 outliers occured
Well this latter plot highlights several points:
ind_3449 seems to have returned ashore twice during his track. Let’s see if we can double-check that using some maps.
# Interactive map of ind_3449's raw track; the rows removed by the sst2
# filter are drawn as red dots
track_3449 <- data_2016[.id == "ind_3449"]
outliers_3449 <- track_3449[sst2_c > 500]
leaflet() %>%
  setView(lng = -122, lat = 38, zoom = 2) %>%
  addTiles() %>%
  addPolylines(
    lat = track_3449[, latitude_degs],
    lng = track_3449[, longitude_degs],
    weight = 2
  ) %>%
  addCircleMarkers(
    lat = outliers_3449[, latitude_degs],
    lng = outliers_3449[, longitude_degs],
    radius = 3,
    stroke = FALSE,
    color = "red",
    fillOpacity = 1
  )
Figure 1.4: It is supposed to be the track of ind_3449… (red dots are the location of removed rows)
… these coordinates seem weird !
# Per-seal summary of the raw coordinates
coords_2016 <- data_2016[, .(.id, longitude_degs, latitude_degs)]
coords_2016 %>%
  tbl_summary(by = .id) %>%
  modify_caption("Summary of `longitude_degree` and `latitude_degree`")
| Characteristic | ind_3449, N = 384 | ind_3450, N = 253 | ind_3456, N = 253 | ind_3457, N = 253 | ind_3460, N = 253 | ind_3463, N = 213 |
|---|---|---|---|---|---|---|
| longitude_degs | -119 (-132, -69) | -124 (-144, -99) | -112 (-134, -6) | -122 (-132, -97) | -122 (-144, -85) | -121 (-134, -93) |
| latitude_degs | 39 (-67, 68) | 60 (-63, 68) | 56 (-63, 68) | 44 (-63, 68) | 59 (-63, 68) | 63 (42, 72) |
1 Median (IQR)
# Distribution of longitude and latitude per seal.
# Reshape to long format first: one row per coordinate value.
coords_long <- melt(
  data_2016[, .(Longitude = longitude_degs, Latitude = latitude_degs, .id)],
  id.vars = ".id",
  value.name = "Coordinate"
)
# `show.legend = FALSE` spelled out instead of the reassignable `F` shortcut
ggplot(coords_long, aes(x = Coordinate, fill = .id)) +
  geom_histogram(show.legend = FALSE) +
  facet_grid(variable ~ .id) +
  theme_jjo()
Figure 1.5: Distribution of coordinates per seal
There is definitely something wrong with these coordinates (five of the seals would have crossed the equator…), but the representation of the track can also be improved! Here are some points to explore:
longitude: a part of the data seems to have the wrong sign, resulting in these distributions that appear to be cut off; latitude: well, this is unsure, but maybe the same problem occurs. Let’s try to play on the coordinates’ sign to see if we can display something that makes more sense.
# Interactive map of ind_3449's track with the coordinates forced into the
# north-east Pacific quadrant (latitude positive, longitude negative)
fixed_track_3449 <- data_2016[
  .id == "ind_3449",
  .(lat = abs(latitude_degs), lng = -abs(longitude_degs))
]
leaflet() %>%
  setView(lng = -122, lat = 50, zoom = 3) %>%
  addTiles() %>%
  addPolylines(lat = fixed_track_3449$lat,
               lng = fixed_track_3449$lng,
               weight = 2)
Figure 1.6: An attempt to display the ind_3449’s track
I’ll better ask Roxanne!
Let’s have a look at what’s inside data_nes$year_2018:
# Compact one-level overview of the 2018 list: one data.table per seal.
# TRUE/FALSE are spelled out instead of the reassignable T/F shortcuts.
str(data_nes$year_2018, max.level = 1, give.attr = FALSE, no.list = TRUE)
## $ ind_2018070:Classes 'data.table' and 'data.frame': 22393 obs. of 41 variables:
## $ ind_2018072:Classes 'data.table' and 'data.frame': 29921 obs. of 41 variables:
## $ ind_2018074:Classes 'data.table' and 'data.frame': 38608 obs. of 41 variables:
## $ ind_2018080:Classes 'data.table' and 'data.frame': 19028 obs. of 41 variables:
A list of 4
data.frames, one for each seal
For convenience, we aggregate all 4 individuals into one dataset.
# Combine all individuals into one data.table. `use.names` is spelled in
# full -- the original `use.name` only worked through R's partial argument
# matching. `idcol = TRUE` keeps each source list name as an `.id` column.
data_2018 <- rbindlist(data_nes$year_2018, use.names = TRUE, idcol = TRUE)
# Display an interactive table of 100 randomly sampled rows
DT::datatable(data_2018[sample.int(.N, 100), ], options = list(scrollX = TRUE))
# Per-seal summary of the raw 2018 dive records: days recorded, number of
# dives, and mean depth/duration/bottom-time/post-dive interval.
# `na.rm = TRUE` spelled out instead of the reassignable `T` shortcut;
# pdi has missing values (see the missing-value check below).
data_2018[, .(
  nb_days_recorded = uniqueN(as.Date(date)),
  nb_dives = .N,
  maxdepth_mean = mean(maxdepth),
  dduration_mean = mean(dduration),
  botttime_mean = mean(botttime),
  pdi_mean = mean(pdi, na.rm = TRUE)
), by = .id] %>%
  sable(caption="Summary diving information relative to each 2018 individual",
        digits = 2)
| .id | nb_days_recorded | nb_dives | maxdepth_mean | dduration_mean | botttime_mean | pdi_mean |
|---|---|---|---|---|---|---|
| ind_2018070 | 232 | 22393 | 305.52 | 783.27 | 243.22 | 109.55 |
| ind_2018072 | 342 | 29921 | 357.86 | 876.96 | 278.02 | 104.90 |
| ind_2018074 | 371 | 38608 | 250.67 | 686.25 | 291.89 | 302.77 |
| ind_2018080 | 215 | 19028 | 296.50 | 867.69 | 339.90 | 103.51 |
Very nice dataset :)
# Build a long dataset flagging missing values: one row per
# (individual, variable, observation), TRUE when the value is NA.
# Identifier/time columns are excluded from the scan. `.SDcols` is spelled
# in full -- the original `.SDcol` only worked through R's partial
# argument matching.
dataPlot <- melt(data_2018[, .(.id, is.na(.SD)), .SDcols = -c(".id",
                                                              "divenumber",
                                                              "year",
                                                              "month",
                                                              "day",
                                                              "hour",
                                                              "min",
                                                              "sec",
                                                              "juldate",
                                                              "divetype",
                                                              "date")])
# Row index within each individual x variable panel (y axis of the tiles);
# seq_len() is the safe idiom for 1:.N
dataPlot[, id_row := seq_len(.N), by = c("variable", ".id")]
# Tile plot: black tile = missing value, white = observed
ggplot(dataPlot, aes(x = variable, y = id_row, fill = value)) +
  geom_tile() +
  labs(x = "Attributes", y = "Rows") +
  scale_fill_manual(values = c("white", "black"),
                    labels = c("Real", "Missing")) +
  facet_wrap(.id ~ ., scales = "free_y") +
  theme_jjo() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.key = element_rect(colour = "black")
  )
Figure 2.1: Check for missing values
So far so good, only a few variables seem to have missing values:
# Percentage of missing values per measurement column (identifier/time
# columns excluded). `.SDcols` is spelled in full -- the original `.SDcol`
# only worked through R's partial argument matching.
table_inter <- data_2018[, lapply(.SD, function(x) {
  # sum(is.na(x)) counts the NAs; keep the original arithmetic order
  round(sum(is.na(x)) * 100 / length(x), 1)
}), .SDcols = -c(
  ".id",
  "divenumber",
  "year",
  "month",
  "day",
  "hour",
  "min",
  "sec",
  "juldate",
  "divetype",
  "date"
)]
# Type-stable vapply (instead of sapply): TRUE for columns with 0% missing
cond_inter <- vapply(table_inter, function(x) x == 0, logical(1))
# Drop the all-complete columns, then display the remaining percentages
table_inter[, which(cond_inter) := NULL] %>%
  sable(caption="Percentage of missing values per columns having missing values!")%>%
  scroll_box(width = "100%")
| lightatsurf | lattenuation | euphoticdepth | thermoclinedepth | driftrate | benthicdivevertrate | cornerindex | foragingindex | verticalspeed90perc | verticalspeed95perc |
|---|---|---|---|---|---|---|---|---|---|
| 26.3 | 89 | 62.6 | 1.3 | 0.5 | 22.7 | 75.8 | 0.5 | 0.1 | 0.1 |
Let’s have a look at all the data.
# All measurement columns: every column of data_2018 except the
# identifier and time fields
names_display <- setdiff(
  names(data_2018),
  c(".id",
    "date",
    "divenumber",
    "year",
    "month",
    "day",
    "hour",
    "min",
    "sec",
    "juldate",
    "divetype")
)
# Loop over every measurement column and render one faceted time-series
# plot per column. The cat() calls emit pandoc markup so that, when
# knitted, each plot lands under its own unnumbered ({-}) section header.
for (i in names_display) {
cat('###', i, '{-} \n')
# print() is required: ggplot objects do not auto-render inside a for loop
print(
ggplot(
# get(i) pulls the current column; melt() reshapes it to a `value` column
data = melt(data_2018[, .(.id, date, get(i))], id.vars = c(".id", "date")),
aes(x = as.Date(date), y = value, col = .id)
) +
# heavy overplotting expected, hence the 1/10 alpha
geom_point(show.legend = FALSE, alpha = 1 / 10) +
facet_wrap(. ~ .id, scales = "free") +
scale_x_date(date_labels = "%m/%Y") +
labs(x="Date", y=i)+
theme_jjo() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
# blank line so pandoc closes the section before the next header
cat(' \n \n')
}
Can we find nice correlations?
# Measurement columns only (character vector subset needs `with = FALSE`;
# spelled out instead of the reassignable `F` shortcut). The subset is
# computed once and reused for both the correlations and the p-values.
corr_data <- data_2018[, names_display, with = FALSE]
# Pairwise-complete correlations, rounded for display
corr_2018 <- round(cor(corr_data, use = "pairwise.complete.obs"), 1)
# Pairs with no overlapping observations yield NA; show them as 0
corr_2018[is.na(corr_2018)] <- 0
# Matching matrix of correlation-test p-values
corr_p_2018 <- cor_pmat(corr_data)
# NA p-values are treated as non-significant
corr_p_2018[is.na(corr_p_2018)] <- 1
# Display: hierarchically clustered circles, non-significant cells blanked
ggcorrplot(
  corr_2018,
  p.mat = corr_p_2018,
  hc.order = TRUE,
  method = "circle",
  type = "lower",
  ggtheme = theme_jjo(),
  insig = "blank",
  colors = c("#00AFBB", "#E7B800", "#FC4E07")
)
Figure 2.2: Correlation matrix